#package
library(ggplot2)

#library part
library(ISLR2)
library(ltm)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
## Loading required package: msm
## Loading required package: polycor
library(tree)
library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: lattice
library(gbm)
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
# EDA
library(dlookr)
## Registered S3 methods overwritten by 'dlookr':
##   method          from  
##   plot.transform  scales
##   print.transform scales
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
## 
##     transform
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#data part
data(OJ)
data("Caravan")

This problem involves the OJ data set which is part of the ISLR2 package.

EDA

head(OJ)
##   Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH
## 1       CH            237       1    1.75    1.99   0.00    0.0         0
## 2       CH            239       1    1.75    1.99   0.00    0.3         0
## 3       CH            245       1    1.86    2.09   0.17    0.0         0
## 4       MM            227       1    1.69    1.69   0.00    0.0         0
## 5       CH            228       7    1.69    1.69   0.00    0.0         0
## 6       CH            230       7    1.69    1.99   0.00    0.0         0
##   SpecialMM  LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM
## 1         0 0.500000        1.99        1.75      0.24     No  0.000000
## 2         1 0.600000        1.69        1.75     -0.06     No  0.150754
## 3         0 0.680000        2.09        1.69      0.40     No  0.000000
## 4         0 0.400000        1.69        1.69      0.00     No  0.000000
## 5         0 0.956535        1.69        1.69      0.00    Yes  0.000000
## 6         1 0.965228        1.99        1.69      0.30    Yes  0.000000
##   PctDiscCH ListPriceDiff STORE
## 1  0.000000          0.24     1
## 2  0.000000          0.24     1
## 3  0.091398          0.23     1
## 4  0.000000          0.00     1
## 5  0.000000          0.00     0
## 6  0.000000          0.30     0
str(OJ)
## 'data.frame':    1070 obs. of  18 variables:
##  $ Purchase      : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
##  $ WeekofPurchase: num  237 239 245 227 228 230 232 234 235 238 ...
##  $ StoreID       : num  1 1 1 1 7 7 7 7 7 7 ...
##  $ PriceCH       : num  1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceMM       : num  1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
##  $ DiscCH        : num  0 0 0.17 0 0 0 0 0 0 0 ...
##  $ DiscMM        : num  0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
##  $ SpecialCH     : num  0 0 0 0 0 0 1 1 0 0 ...
##  $ SpecialMM     : num  0 1 0 0 0 1 1 0 0 0 ...
##  $ LoyalCH       : num  0.5 0.6 0.68 0.4 0.957 ...
##  $ SalePriceMM   : num  1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
##  $ SalePriceCH   : num  1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceDiff     : num  0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
##  $ Store7        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
##  $ PctDiscMM     : num  0 0.151 0 0 0 ...
##  $ PctDiscCH     : num  0 0 0.0914 0 0 ...
##  $ ListPriceDiff : num  0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
##  $ STORE         : num  1 1 1 1 0 0 0 0 0 0 ...

data description) 오렌지 주스 브랜드인 Citrus Hill과 Minute Maid에 대한 데이터셋이다.

Attribute)
observations : 1070
variables : 18
1. Purchase : 소비자가 구매한 오렌지 주스의 브랜드 (CH 또는 MM)*
2. WeekofPurchase : 구입이 이뤄진 주차
3. StoreID : 상점 ID
4. PriceCH : 브랜드 CH의 가격*
5. PriceMM : 브랜드 MM의 가격*
6. DiscCH : 브랜드 CH의 할인 가격
7. DiscMM : 브랜드 MM의 할인 가격
8. SpecialCH : 브랜드 CH의 특별 행사 여부
9. SpecialMM : 브랜드 MM의 특별 행사 여부
10. LoyalCH : 브랜드 CH에 대한 고객 브랜드 충성도*
11. SalePriceMM : 브랜드 MM에 대한 할인된 가격(list price - DiscMM)
12. SalePriceCH : 브랜드 CH에 대한 할인된 가격(list price - DiscCH)
13. PriceDiff : 두 브랜드 간의 가격 차이(MM price - CH price)*
14. Store7 : 7번 상점에서 구매가 이뤄졌는지의 여부
15. PctDiscMM : MM의 할인율
16. PctDiscCH : CH의 할인율
17. ListPriceDiff : 두 브랜드간의 할인 전 가격 차이(MM - CH)*
18. STORE: 상점의 ID를 나타내는 변수, range : [0, 4]

summary(OJ)
##  Purchase WeekofPurchase     StoreID        PriceCH         PriceMM     
##  CH:653   Min.   :227.0   Min.   :1.00   Min.   :1.690   Min.   :1.690  
##  MM:417   1st Qu.:240.0   1st Qu.:2.00   1st Qu.:1.790   1st Qu.:1.990  
##           Median :257.0   Median :3.00   Median :1.860   Median :2.090  
##           Mean   :254.4   Mean   :3.96   Mean   :1.867   Mean   :2.085  
##           3rd Qu.:268.0   3rd Qu.:7.00   3rd Qu.:1.990   3rd Qu.:2.180  
##           Max.   :278.0   Max.   :7.00   Max.   :2.090   Max.   :2.290  
##      DiscCH            DiscMM         SpecialCH        SpecialMM     
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.05186   Mean   :0.1234   Mean   :0.1477   Mean   :0.1617  
##  3rd Qu.:0.00000   3rd Qu.:0.2300   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :0.50000   Max.   :0.8000   Max.   :1.0000   Max.   :1.0000  
##     LoyalCH          SalePriceMM     SalePriceCH      PriceDiff       Store7   
##  Min.   :0.000011   Min.   :1.190   Min.   :1.390   Min.   :-0.6700   No :714  
##  1st Qu.:0.325257   1st Qu.:1.690   1st Qu.:1.750   1st Qu.: 0.0000   Yes:356  
##  Median :0.600000   Median :2.090   Median :1.860   Median : 0.2300            
##  Mean   :0.565782   Mean   :1.962   Mean   :1.816   Mean   : 0.1465            
##  3rd Qu.:0.850873   3rd Qu.:2.130   3rd Qu.:1.890   3rd Qu.: 0.3200            
##  Max.   :0.999947   Max.   :2.290   Max.   :2.090   Max.   : 0.6400            
##    PctDiscMM        PctDiscCH       ListPriceDiff       STORE      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.140   1st Qu.:0.000  
##  Median :0.0000   Median :0.00000   Median :0.240   Median :2.000  
##  Mean   :0.0593   Mean   :0.02731   Mean   :0.218   Mean   :1.631  
##  3rd Qu.:0.1127   3rd Qu.:0.00000   3rd Qu.:0.300   3rd Qu.:3.000  
##  Max.   :0.4020   Max.   :0.25269   Max.   :0.440   Max.   :4.000


CH : 653 MM : 417

<Store 7> No : 714 Yes : 356

상식적으로 생각해봤을 때, 구매에 있어서 가장 중요한 요소들은 각 제품의 가격과 경쟁 제품간의 가격 차이 그리고 브랜드에 대한 충성도라고 유추할 수 있다. 따라서, PriceCH, PriceMM, PriceDiff, LoyalCH 변수들을 중점적으로 과제를 진행한다.

# Scatter plot of list prices (PriceCH vs PriceMM), colored by purchased brand.
# Title fixed: this plot shows the list prices, not the sale prices
# (the original title "SalePriceCH and SalePriceMM" was copied from the
# sale-price plot further below).
ggplot(OJ, aes(x = PriceCH, y = PriceMM, color = Purchase)) +
  geom_point() +
  labs(title = "PriceCH and PriceMM",
       x = "Price_CH",
       y = "Price_MM",
       color = "Purchase") +
  theme_minimal()

위의 scatter plot은 Price_CH와 Price_MM의 관계에 따른 Purchase의 종류를 확인한 것이다. 점들을 보면, 같은 색상끼리 모여서 분포해 있는 것을 확인할 수 있다. 브랜드별 Price는 Purchase를 예측하는 데 유의미하다고 결론을 내린다.

# Violin plot (with a narrow boxplot overlay) of CH brand loyalty (LoyalCH),
# split by the purchased brand.
ggplot(OJ, aes(x = Purchase, y = LoyalCH)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.1) +
  labs(title = "Violin Plot of LoyalCH by Purchase",
       x = "Purchase",
       y = "LoyalCH") +
  theme_minimal()

CH에 대한 브랜드 충성도가 높을수록, CH의 제품 구매로 이뤄지는 경우가 많다는 사실을 확인했다. 따라서, LoyalCH는 Purchase 예측에 유의미한 예측변수로 사용될 수 있다는 결론을 내린다.

# Scatter plot of discounted (sale) prices for the two brands, colored by
# purchased brand.
ggplot(OJ, aes(x = SalePriceCH, y = SalePriceMM, color = Purchase)) + 
   geom_point() +
  labs(title = "SalePriceCH and SalePriceMM",
       x = "SalePrice_CH",
       y = "SalePrice_MM",
       color = "Purchase") +
  theme_minimal()

같은 색상의 점들끼리 대체적으로 군집을 이루고 있는 모습을 확인할 수 있다. 브랜드별 SalePrice 역시 Purchase를 예측하는 데 유의미하다고 판단을 내린다.

ggplot(OJ, aes(x = Purchase, y = PriceDiff)) + geom_boxplot()

PriceDiff = MM price - CH price
MM이 CH보다 비싸다면, 사람들은 상대적으로 저렴한 CH를 선택한다는 것을 확인할 수 있었다. 가격의 차이가 Purchase를 예측하는 데 유의미하다고 판단할 수 있다.

diagnose_outlier(OJ)
##         variables outliers_cnt outliers_ratio outliers_mean    with_mean
## 1  WeekofPurchase            0       0.000000           NaN 254.38130841
## 2         StoreID            0       0.000000           NaN   3.95981308
## 3         PriceCH            0       0.000000           NaN   1.86742056
## 4         PriceMM           57       5.327103     1.6900000   2.08541121
## 5          DiscCH          232      21.682243     0.2391810   0.05185981
## 6          DiscMM           41       3.831776     0.7731707   0.12336449
## 7       SpecialCH          158      14.766355     1.0000000   0.14766355
## 8       SpecialMM          173      16.168224     1.0000000   0.16168224
## 9         LoyalCH            0       0.000000           NaN   0.56578233
## 10    SalePriceMM            0       0.000000           NaN   1.96204673
## 11    SalePriceCH           75       7.009346     1.4540000   1.81556075
## 12      PriceDiff           36       3.364486    -0.5947222   0.14648598
## 13      PctDiscMM           41       3.831776     0.3625908   0.05929844
## 14      PctDiscCH          232      21.682243     0.1259733   0.02731384
## 15  ListPriceDiff            0       0.000000           NaN   0.21799065
## 16          STORE            0       0.000000           NaN   1.63084112
##    without_mean
## 1  254.38130841
## 2    3.95981308
## 3    1.86742056
## 4    2.10766041
## 5    0.00000000
## 6    0.09747328
## 7    0.00000000
## 8    0.00000000
## 9    0.56578233
## 10   1.96204673
## 11   1.84281407
## 12   0.17229207
## 13   0.04721390
## 14   0.00000000
## 15   0.21799065
## 16   1.63084112


PriceMM : 57(ratio : 5%)
DiscCH : 232(ratio : 22%)
DiscMM : 41(ratio : 4%)
SalePriceCH : 75(ratio : 7%)
PriceDiff : 36(ratio : 3%)
PctDiscMM : 41(ratio : 4%)
PctDiscCH : 232(ratio : 22%)


DiscCH의 경우, 이상치의 비율이 22%가 넘는다. 높은 이상치를 갖는다는 사실을 확인할 수 있었다.
SalePriceCH의 경우도 이상치가 7%로 높은 수준을 보이지만, 이것은 DiscCH 변수의 영향으로 보인다.
PctDiscCH 변수 역시, DiscCH에 종속적이기 때문에 22%의 높은 이상치 비율을 갖는다고 결론을 내릴 수 있다.

plot_outlier(OJ)

remove_outliers_IQR = function(data, column_name) {
    # Drop rows whose value in `column_name` falls outside the Tukey fences
    # [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
    #
    # Args:
    #   data        : a data.frame
    #   column_name : name (string) of a numeric column in `data`
    # Returns: `data` restricted to the non-outlier rows.
    Q1 = quantile(data[[column_name]], 0.25, na.rm = TRUE)
    Q3 = quantile(data[[column_name]], 0.75, na.rm = TRUE)
    # Renamed from `IQR` to avoid shadowing stats::IQR.
    iqr_width = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * iqr_width
    upper_bound = Q3 + 1.5 * iqr_width
    
    # which() drops NA comparisons explicitly; the original `data[cond, ]`
    # would have produced NA-filled junk rows for any NA in the column.
    keep = which(data[[column_name]] >= lower_bound & data[[column_name]] <= upper_bound)
    cleaned_data = data[keep, ]
    return(cleaned_data)
}

removed_OJ = remove_outliers_IQR(OJ, "DiscCH")
diagnose_outlier(removed_OJ)
##         variables outliers_cnt outliers_ratio outliers_mean    with_mean
## 1  WeekofPurchase            0      0.0000000           NaN 251.32935561
## 2         StoreID            0      0.0000000           NaN   3.69809069
## 3         PriceCH            0      0.0000000           NaN   1.85304296
## 4         PriceMM           51      6.0859189     1.6900000   2.08161098
## 5          DiscCH            0      0.0000000           NaN   0.00000000
## 6          DiscMM           41      4.8926014     0.7731707   0.12930788
## 7       SpecialCH           62      7.3985680     1.0000000   0.07398568
## 8       SpecialMM          156     18.6157518     1.0000000   0.18615752
## 9         LoyalCH            0      0.0000000           NaN   0.54701148
## 10    SalePriceMM            0      0.0000000           NaN   1.95230310
## 11    SalePriceCH            0      0.0000000           NaN   1.85304296
## 12      PriceDiff            7      0.8353222    -0.6700000   0.09926014
## 13      PctDiscMM           41      4.8926014     0.3625908   0.06224205
## 14      PctDiscCH            0      0.0000000           NaN   0.00000000
## 15  ListPriceDiff           92     10.9785203     0.0000000   0.22856802
## 16          STORE            0      0.0000000           NaN   1.66825776
##    without_mean
## 1  251.32935561
## 2    3.69809069
## 3    1.85304296
## 4    2.10698856
## 5    0.00000000
## 6    0.09618570
## 7    0.00000000
## 8    0.00000000
## 9    0.54701148
## 10   1.95230310
## 11   1.85304296
## 12   0.10574007
## 13   0.04679124
## 14   0.00000000
## 15   0.25675603
## 16   1.66825776

DiscCH 변수의 이상치를 제거함으로써 SalePriceCH와 PctDiscCH의 outlier가 완전히 제거된 것을 확인할 수 있다.

plot_normality(removed_OJ)

Q-Q plot을 통해서 확인해 본 결과, 모든 데이터들이 정규 분포를 따르지는 않았다. log transformation과 sqrt transformation을 한 이후에도 정규분포의 형태는 찾아보기 힘들었다.

correlate(OJ)
## # A tibble: 240 × 3
##    var1        var2           coef_corr
##    <fct>       <fct>              <dbl>
##  1 StoreID     WeekofPurchase    0.0960
##  2 PriceCH     WeekofPurchase    0.704 
##  3 PriceMM     WeekofPurchase    0.577 
##  4 DiscCH      WeekofPurchase    0.366 
##  5 DiscMM      WeekofPurchase    0.242 
##  6 SpecialCH   WeekofPurchase    0.108 
##  7 SpecialMM   WeekofPurchase    0.0707
##  8 LoyalCH     WeekofPurchase    0.193 
##  9 SalePriceMM WeekofPurchase    0.102 
## 10 SalePriceCH WeekofPurchase    0.201 
## # ℹ 230 more rows
SpecialCH vs SpecialMM : histogram
# Count the number of purchase records where each brand ran a special promotion.
specialCH_cnt = sum(OJ$SpecialCH == 1)
specialMM_cnt = sum(OJ$SpecialMM == 1)

special_cnts = data.frame(
  variable = c("SpecialCH", "SpecialMM"),
  count = c(specialCH_cnt, specialMM_cnt)
)
# Bar chart comparing promotion frequency between the two brands.
ggplot(special_cnts, aes(x = variable, y = count, fill = variable)) +
  geom_bar(stat = "identity", color = "black") +
  labs(title = "Comparison of Indicator of special",
      x = "Variable", y = "Count") +
  scale_fill_manual(values = c("SpecialCH" = "blue", "SpecialMM" = "orange")) +
  theme_minimal()

두 브랜드의 할인 횟수에 대한 빈도는 큰 차이를 보이지 않았다.

# remove missing value
# Keep only complete rows, then profile PriceCH against the Purchase target
# using dlookr's target_by()/relate() and plot the relationship.
removed_OJ = removed_OJ %>% filter(complete.cases(.))
categ = target_by(removed_OJ, Purchase)
cat_num = relate(categ, PriceCH)
plot(cat_num)

가격범위가 [1.8, 1.9] 구간에서는 CH가 우세하다는 것을 확인할 수 있었다.

(a) Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.

nrow(OJ)
## [1] 1070
removed_OJ = remove_outliers_IQR(OJ, "DiscCH")
cat("size of removed_OJ : ", nrow(removed_OJ))
## size of removed_OJ :  838

결측치와 이상치를 제거하고 남은 observations : 838개

#split the data into two part.(train set : test set)
set.seed(111)
# Stratified split on Purchase; p is chosen so the training set holds ~800 rows.
train_index = createDataPartition(removed_OJ$Purchase, p = 800 / nrow(removed_OJ), list = FALSE)

# Create the training and test sets
train_set = removed_OJ[train_index, ]
test_set = removed_OJ[-train_index, ]

cat("size of training set:", nrow(train_set), "\n")
## size of training set: 801
cat("size of test set:", nrow(test_set), "\n")
## size of test set: 37
cat("<ratio> \n")
## <ratio>
cat("training set : test set = ",nrow(train_set) / nrow(removed_OJ) ," : ", nrow(test_set) / nrow(removed_OJ))
## training set : test set =  0.9558473  :  0.04415274
## training set : test set =  0.9558473  :  0.04415274

training set의 observation을 800으로 하고 나머지 observation을 test set으로 사용한 결과, [training set : test set = 0.96 : 0.04] 로 test set이 상당히 부족했다. 이것은 결측치와 이상치를 제거한 결과이다.

(b) Fit a tree to the training data, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics about the tree, and describe the results obtained. What is the training error rate? How many terminal nodes does the tree have?

#모든 변수 사용
# Classification tree using all predictors
tree_model_all = rpart(Purchase ~ ., data  = train_set, method="class")
# Classification tree using only the predictors highlighted in the EDA
tree_model_selected = rpart(Purchase ~ PriceCH + PriceMM + PriceDiff + LoyalCH,
                            data = train_set, method = "class")
모든 변수를 사용한 경우 : Purchase ~ .
#모든 변수 사용한 tree
summary(tree_model_all)
## Call:
## rpart(formula = Purchase ~ ., data = train_set, method = "class")
##   n= 801 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.53801170      0 1.0000000 1.0000000 0.04093333
## 2 0.02485380      1 0.4619883 0.5000000 0.03390986
## 3 0.01461988      3 0.4122807 0.5058480 0.03405340
## 4 0.01000000      4 0.3976608 0.4853801 0.03354274
## 
## Variable importance
##        LoyalCH  ListPriceDiff      PriceDiff        PriceMM    SalePriceMM 
##             66              9              6              4              4 
## WeekofPurchase         DiscMM        StoreID        PriceCH    SalePriceCH 
##              4              2              2              1              1 
## 
## Node number 1: 801 observations,    complexity param=0.5380117
##   predicted class=CH  expected loss=0.4269663  P(node) =1
##     class counts:   459   342
##    probabilities: 0.573 0.427 
##   left son=2 (483 obs) right son=3 (318 obs)
##   Primary splits:
##       LoyalCH     < 0.48285   to the right, improve=138.47760, (0 missing)
##       StoreID     < 3.5       to the right, improve= 26.90500, (0 missing)
##       PriceDiff   < 0.015     to the right, improve= 19.64897, (0 missing)
##       SalePriceMM < 1.84      to the right, improve= 19.58088, (0 missing)
##       DiscMM      < 0.47      to the left,  improve= 17.82679, (0 missing)
##   Surrogate splits:
##       StoreID        < 3.5       to the right, agree=0.614, adj=0.028, (0 split)
##       PriceMM        < 1.89      to the right, agree=0.612, adj=0.022, (0 split)
##       ListPriceDiff  < 0.035     to the right, agree=0.610, adj=0.019, (0 split)
##       WeekofPurchase < 227.5     to the right, agree=0.609, adj=0.016, (0 split)
##       DiscMM         < 0.57      to the left,  agree=0.609, adj=0.016, (0 split)
## 
## Node number 2: 483 observations,    complexity param=0.0248538
##   predicted class=CH  expected loss=0.1884058  P(node) =0.6029963
##     class counts:   392    91
##    probabilities: 0.812 0.188 
##   left son=4 (277 obs) right son=5 (206 obs)
##   Primary splits:
##       LoyalCH     < 0.705699  to the right, improve=16.46708, (0 missing)
##       SalePriceMM < 1.84      to the right, improve=15.70711, (0 missing)
##       PriceDiff   < 0.015     to the right, improve=14.62609, (0 missing)
##       DiscMM      < 0.47      to the left,  improve=12.78514, (0 missing)
##       PctDiscMM   < 0.227263  to the left,  improve=12.78514, (0 missing)
##   Surrogate splits:
##       WeekofPurchase < 237.5     to the right, agree=0.631, adj=0.136, (0 split)
##       PriceCH        < 1.755     to the right, agree=0.625, adj=0.121, (0 split)
##       SalePriceCH    < 1.755     to the right, agree=0.625, adj=0.121, (0 split)
##       PriceMM        < 2.04      to the right, agree=0.621, adj=0.112, (0 split)
##       SpecialMM      < 0.5       to the left,  agree=0.598, adj=0.058, (0 split)
## 
## Node number 3: 318 observations
##   predicted class=MM  expected loss=0.2106918  P(node) =0.3970037
##     class counts:    67   251
##    probabilities: 0.211 0.789 
## 
## Node number 4: 277 observations
##   predicted class=CH  expected loss=0.07581227  P(node) =0.3458177
##     class counts:   256    21
##    probabilities: 0.924 0.076 
## 
## Node number 5: 206 observations,    complexity param=0.0248538
##   predicted class=CH  expected loss=0.3398058  P(node) =0.2571785
##     class counts:   136    70
##    probabilities: 0.660 0.340 
##   left son=10 (131 obs) right son=11 (75 obs)
##   Primary splits:
##       ListPriceDiff < 0.235     to the right, improve=17.647740, (0 missing)
##       SalePriceMM   < 1.84      to the right, improve=16.049010, (0 missing)
##       PriceDiff     < 0.145     to the right, improve=15.304000, (0 missing)
##       DiscMM        < 0.15      to the left,  improve= 8.961921, (0 missing)
##       PctDiscMM     < 0.0729725 to the left,  improve= 8.961921, (0 missing)
##   Surrogate splits:
##       PriceDiff      < 0.235     to the right, agree=0.859, adj=0.613, (0 split)
##       SalePriceMM    < 1.94      to the right, agree=0.777, adj=0.387, (0 split)
##       PriceMM        < 1.89      to the right, agree=0.748, adj=0.307, (0 split)
##       WeekofPurchase < 273.5     to the left,  agree=0.733, adj=0.267, (0 split)
##       DiscMM         < 0.47      to the left,  agree=0.709, adj=0.200, (0 split)
## 
## Node number 10: 131 observations
##   predicted class=CH  expected loss=0.1832061  P(node) =0.1635456
##     class counts:   107    24
##    probabilities: 0.817 0.183 
## 
## Node number 11: 75 observations,    complexity param=0.01461988
##   predicted class=MM  expected loss=0.3866667  P(node) =0.09363296
##     class counts:    29    46
##    probabilities: 0.387 0.613 
##   left son=22 (23 obs) right son=23 (52 obs)
##   Primary splits:
##       PriceDiff   < 0.015     to the right, improve=3.270658, (0 missing)
##       SalePriceMM < 1.84      to the right, improve=3.150053, (0 missing)
##       PctDiscMM   < 0.1961965 to the left,  improve=3.055152, (0 missing)
##       DiscMM      < 0.15      to the left,  improve=2.350005, (0 missing)
##       LoyalCH     < 0.5976    to the right, improve=2.084961, (0 missing)
##   Surrogate splits:
##       SalePriceMM    < 1.84      to the right, agree=0.973, adj=0.913, (0 split)
##       ListPriceDiff  < 0.18      to the right, agree=0.787, adj=0.304, (0 split)
##       WeekofPurchase < 275.5     to the right, agree=0.733, adj=0.130, (0 split)
##       PctDiscMM      < 0.0980985 to the left,  agree=0.733, adj=0.130, (0 split)
##       DiscMM         < 0.15      to the left,  agree=0.707, adj=0.043, (0 split)
## 
## Node number 22: 23 observations
##   predicted class=CH  expected loss=0.3913043  P(node) =0.02871411
##     class counts:    14     9
##    probabilities: 0.609 0.391 
## 
## Node number 23: 52 observations
##   predicted class=MM  expected loss=0.2884615  P(node) =0.06491885
##     class counts:    15    37
##    probabilities: 0.288 0.712
print(tree_model_all$variable.importance)
##        LoyalCH  ListPriceDiff      PriceDiff        PriceMM    SalePriceMM 
##    154.9446916     21.2559470     14.0946076     10.2987813      9.8100472 
## WeekofPurchase         DiscMM        StoreID        PriceCH    SalePriceCH 
##      9.5482370      5.8490724      3.9191778      1.9984316      1.9984316 
##      SpecialMM      PctDiscMM 
##      0.9592471      0.4266075

모든 변수를 사용해서 model을 만들었을 때, LoyalCH가 66으로 모델에서 가장 많이 사용되었다.

train_pred_all = predict(tree_model_all, train_set, type = "class")
test_pred_all = predict(tree_model_all, test_set, type = "class")

# Confusion matrices: rows = actual Purchase, columns = predicted class.
train_conf_matrix_all = table(train_set$Purchase, train_pred_all)
test_conf_matrix_all = table(test_set$Purchase, test_pred_all)

# Error rate = 1 - accuracy (correct predictions lie on the diagonal)
train_error_rate = 1 - sum(diag(train_conf_matrix_all)) / sum(train_conf_matrix_all)
test_error_rate = 1 - sum(diag(test_conf_matrix_all)) / sum(test_conf_matrix_all)

cat("train Error Rate:", train_error_rate, "\n")
## train Error Rate: 0.1697878
cat("test Error Rate:", test_error_rate, "\n")
## test Error Rate: 0.2162162
# Count terminal (leaf) nodes in the fitted tree
num_terminal_nodes = sum(tree_model_all$frame$var == "<leaf>")
cat("Number of Terminal Nodes:", num_terminal_nodes, "\n")
## Number of Terminal Nodes: 5

train Error Rate : 17%
test Error Rate : 22%
number of leaf nodes : 5

일부 변수만을 사용한 경우 : Purchase ~ PriceCH + PriceMM + PriceDiff + LoyalCH
summary(tree_model_selected)
## Call:
## rpart(formula = Purchase ~ PriceCH + PriceMM + PriceDiff + LoyalCH, 
##     data = train_set, method = "class")
##   n= 801 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.53801170      0 1.0000000 1.0000000 0.04093333
## 2 0.01461988      1 0.4619883 0.4766082 0.03331668
## 3 0.01000000      5 0.3976608 0.4619883 0.03293000
## 
## Variable importance
##   LoyalCH PriceDiff   PriceMM   PriceCH 
##        80        11         6         3 
## 
## Node number 1: 801 observations,    complexity param=0.5380117
##   predicted class=CH  expected loss=0.4269663  P(node) =1
##     class counts:   459   342
##    probabilities: 0.573 0.427 
##   left son=2 (483 obs) right son=3 (318 obs)
##   Primary splits:
##       LoyalCH   < 0.48285   to the right, improve=138.477600, (0 missing)
##       PriceDiff < 0.015     to the right, improve= 19.648970, (0 missing)
##       PriceMM   < 1.89      to the right, improve=  6.509978, (0 missing)
##       PriceCH   < 1.925     to the left,  improve=  1.893242, (0 missing)
##   Surrogate splits:
##       PriceMM   < 1.89      to the right, agree=0.612, adj=0.022, (0 split)
##       PriceDiff < -0.575    to the right, agree=0.608, adj=0.013, (0 split)
## 
## Node number 2: 483 observations,    complexity param=0.01461988
##   predicted class=CH  expected loss=0.1884058  P(node) =0.6029963
##     class counts:   392    91
##    probabilities: 0.812 0.188 
##   left son=4 (277 obs) right son=5 (206 obs)
##   Primary splits:
##       LoyalCH   < 0.705699  to the right, improve=16.467080, (0 missing)
##       PriceDiff < 0.015     to the right, improve=14.626090, (0 missing)
##       PriceMM   < 1.74      to the right, improve= 5.876976, (0 missing)
##       PriceCH   < 1.72      to the right, improve= 1.927536, (0 missing)
##   Surrogate splits:
##       PriceCH   < 1.755     to the right, agree=0.625, adj=0.121, (0 split)
##       PriceMM   < 2.04      to the right, agree=0.621, adj=0.112, (0 split)
##       PriceDiff < 0.015     to the right, agree=0.588, adj=0.034, (0 split)
## 
## Node number 3: 318 observations
##   predicted class=MM  expected loss=0.2106918  P(node) =0.3970037
##     class counts:    67   251
##    probabilities: 0.211 0.789 
## 
## Node number 4: 277 observations
##   predicted class=CH  expected loss=0.07581227  P(node) =0.3458177
##     class counts:   256    21
##    probabilities: 0.924 0.076 
## 
## Node number 5: 206 observations,    complexity param=0.01461988
##   predicted class=CH  expected loss=0.3398058  P(node) =0.2571785
##     class counts:   136    70
##    probabilities: 0.660 0.340 
##   left son=10 (114 obs) right son=11 (92 obs)
##   Primary splits:
##       PriceDiff < 0.145     to the right, improve=15.3040000, (0 missing)
##       PriceMM   < 2.155     to the right, improve= 4.0255650, (0 missing)
##       LoyalCH   < 0.6840375 to the left,  improve= 2.8009220, (0 missing)
##       PriceCH   < 1.72      to the right, improve= 0.7231615, (0 missing)
##   Surrogate splits:
##       PriceMM < 2.04      to the right, agree=0.723, adj=0.380, (0 split)
##       PriceCH < 1.755     to the right, agree=0.626, adj=0.163, (0 split)
##       LoyalCH < 0.6955035 to the left,  agree=0.568, adj=0.033, (0 split)
## 
## Node number 10: 114 observations
##   predicted class=CH  expected loss=0.1666667  P(node) =0.1423221
##     class counts:    95    19
##    probabilities: 0.833 0.167 
## 
## Node number 11: 92 observations,    complexity param=0.01461988
##   predicted class=MM  expected loss=0.4456522  P(node) =0.1148564
##     class counts:    41    51
##    probabilities: 0.446 0.554 
##   left son=22 (59 obs) right son=23 (33 obs)
##   Primary splits:
##       PriceDiff < -0.165    to the right, improve=4.2505640, (0 missing)
##       PriceCH   < 1.755     to the left,  improve=1.4818630, (0 missing)
##       LoyalCH   < 0.5039495 to the right, improve=1.3921350, (0 missing)
##       PriceMM   < 2.11      to the left,  improve=0.5869565, (0 missing)
##   Surrogate splits:
##       PriceCH < 1.91      to the left,  agree=0.783, adj=0.394, (0 split)
##       PriceMM < 2.11      to the left,  agree=0.717, adj=0.212, (0 split)
##       LoyalCH < 0.6111595 to the left,  agree=0.652, adj=0.030, (0 split)
## 
## Node number 22: 59 observations,    complexity param=0.01461988
##   predicted class=CH  expected loss=0.440678  P(node) =0.07365793
##     class counts:    33    26
##    probabilities: 0.559 0.441 
##   left son=44 (42 obs) right son=45 (17 obs)
##   Primary splits:
##       LoyalCH   < 0.5039495 to the right, improve=2.034326, (0 missing)
##       PriceMM   < 1.74      to the right, improve=1.538646, (0 missing)
##       PriceDiff < -0.09     to the left,  improve=1.374719, (0 missing)
##       PriceCH   < 1.72      to the right, improve=1.017856, (0 missing)
## 
## Node number 23: 33 observations
##   predicted class=MM  expected loss=0.2424242  P(node) =0.0411985
##     class counts:     8    25
##    probabilities: 0.242 0.758 
## 
## Node number 44: 42 observations
##   predicted class=CH  expected loss=0.3571429  P(node) =0.05243446
##     class counts:    27    15
##    probabilities: 0.643 0.357 
## 
## Node number 45: 17 observations
##   predicted class=MM  expected loss=0.3529412  P(node) =0.02122347
##     class counts:     6    11
##    probabilities: 0.353 0.647
print(tree_model_selected$variable.importance)
##    LoyalCH  PriceDiff    PriceMM    PriceCH 
## 157.606866  21.855978  11.610614   6.168113

변수의 일부를 선택해서 model을 생성했을 때도, LoyalCH 변수의 중요도가 높은 것을 확인할 수 있었다. 소비자들의 제품 선택에 있어서 브랜드 충성도가 가장 중요한 요소라는 것을 알 수 있었다.

train_pred_selected = predict(tree_model_selected, train_set, type = "class")
test_pred_selected = predict(tree_model_selected, test_set, type = "class")

# BUG FIX: the confusion matrices were previously built from the all-variable
# model's predictions (train_pred_all / test_pred_all), so the reported error
# rates merely repeated the previous model's numbers. Use the selected-model
# predictions here.
train_conf_matrix_selected = table(train_set$Purchase, train_pred_selected)
test_conf_matrix_selected = table(test_set$Purchase, test_pred_selected)

# Error rates for the selected-variable model.
# (Also fixes the `test_error_rate_` typo, which left the printed
# `test_error_rate` stale from the previous chunk.)
train_error_rate = 1 - sum(diag(train_conf_matrix_selected)) / sum(train_conf_matrix_selected)
test_error_rate = 1 - sum(diag(test_conf_matrix_selected)) / sum(test_conf_matrix_selected)

cat("train Error Rate:", train_error_rate, "\n")
cat("test Error Rate:", test_error_rate, "\n")
# Count terminal (leaf) nodes in the selected-variable tree
num_terminal_nodes = sum(tree_model_selected$frame$var == "<leaf>")
cat("Number of Terminal Nodes:", num_terminal_nodes, "\n")
## Number of Terminal Nodes: 6

변수를 선택해서 넣은 경우의 training error rate: 17%, test error rate: 22%로 모든 변수를 넣은 경우와 유의미한 차이를 보이지 않았다. 다만, leaf node의 수는 변수를 선택해서 넣은 경우가 6개로 모든 변수를 사용했을 때 보다 1개 더 많은 것을 확인할 수 있었다. (검토 노트: 위 코드 청크의 confusion matrix가 train_pred_all / test_pred_all 로 계산되어 있어 두 모델의 error rate이 완전히 동일하게 출력되었다. train_pred_selected / test_pred_selected 로 다시 계산해야 올바른 비교가 가능하다.)

node 내 class 분포의 미세한 변화에 덜 민감하게 반응하는 classification error rate 지표의 특성 때문에 2가지 모델에서 동일한 error rate이 얻어진 것이라고 생각해서 Gini- Index와 Cross-Entropy 지표도 확인하기로 한다.

두 모델이 유의미한 결과의 차이를 보이지 않는 또다른 이유로는 test data set의 부족때문이라고 예상할 수 있다.

calc_gini_entropy = function(data, model) {
  # Score `model` on `data` with two impurity-style metrics, summed over the
  # response classes of data$Purchase:
  #   Gini         : sum_k mean( p_k * (1 - p_k) )
  #   CrossEntropy : sum_k mean( -1{y == k} * log(p_k + 1e-10) )
  # The 1e-10 offset guards against log(0).
  # Returns list(Gini = ..., CrossEntropy = ...).
  prob_matrix = predict(model, data, type = "prob")
  actual = data$Purchase

  gini_total = 0
  entropy_total = 0

  for (class_label in levels(actual)) {
    p_class = prob_matrix[, class_label]
    is_class = as.numeric(actual == class_label)

    gini_total = gini_total + mean(p_class * (1 - p_class))
    entropy_total = entropy_total - mean(is_class * log(p_class + 1e-10))
  }

  list(Gini = gini_total, CrossEntropy = entropy_total)
}
train_results_all = calc_gini_entropy(train_set, tree_model_all)
test_results_all = calc_gini_entropy(test_set, tree_model_all)

train_results_selected = calc_gini_entropy(train_set, tree_model_selected)
test_results_selected = calc_gini_entropy(test_set, tree_model_selected)

cat("<Tree model all> \n",
    "**train_set** \n",
    "Training Set Gini Index:", train_results_all$Gini, "\n" , 
    "Training Set Cross-Entropy:", train_results_all$CrossEntropy, "\n")
## <Tree model all> 
##  **train_set** 
##  Training Set Gini Index: 0.2697777 
##  Training Set Cross-Entropy: 0.4333347
cat("**test_set** \n",
    "Test Set Gini Index:", test_results_all$Gini, "\n",
    "Test Set Cross-Entropy:", test_results_all$CrossEntropy, "\n")
## **test_set** 
##  Test Set Gini Index: 0.2603624 
##  Test Set Cross-Entropy: 0.5127495
cat("\n <Tree model selected> \n **train_set**\n",
    "Train Set Gini Index:", train_results_selected$Gini, "\n",
    "Train Set Cross-Entropy:", train_results_selected$CrossEntropy, "\n")
## 
##  <Tree model selected> 
##  **train_set**
##  Train Set Gini Index: 0.2689407 
##  Train Set Cross-Entropy: 0.4321273
cat("**test_set** \n",
    "Test Set Gini Index:", test_results_selected$Gini, "\n",
    "Test Set Cross-Entropy:", test_results_selected$CrossEntropy, "\n")
## **test_set** 
##  Test Set Gini Index: 0.2503403 
##  Test Set Cross-Entropy: 0.5086116

2가지 모델 전부, test set 에 대해서는 gini index와 cross entropy의 수치가 높아지는 것을 확인할 수 있다. 미세한 차이지만 predictor를 일부만 사용한 경우가 모든 predictor를 사용했을 때보다 낮은 것을 볼 수 있다.

(c) Type in the name of the tree object in order to get a detailed text output. Pick one of the terminal nodes, and interpret the information displayed.

tree_model_all : name of the tree object
# Detailed text output of the rpart fit: per-node counts, class probabilities,
# primary splits and surrogate splits (part (c)).
summary(tree_model_all)
## Call:
## rpart(formula = Purchase ~ ., data = train_set, method = "class")
##   n= 801 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.53801170      0 1.0000000 1.0000000 0.04093333
## 2 0.02485380      1 0.4619883 0.5000000 0.03390986
## 3 0.01461988      3 0.4122807 0.5058480 0.03405340
## 4 0.01000000      4 0.3976608 0.4853801 0.03354274
## 
## Variable importance
##        LoyalCH  ListPriceDiff      PriceDiff        PriceMM    SalePriceMM 
##             66              9              6              4              4 
## WeekofPurchase         DiscMM        StoreID        PriceCH    SalePriceCH 
##              4              2              2              1              1 
## 
## Node number 1: 801 observations,    complexity param=0.5380117
##   predicted class=CH  expected loss=0.4269663  P(node) =1
##     class counts:   459   342
##    probabilities: 0.573 0.427 
##   left son=2 (483 obs) right son=3 (318 obs)
##   Primary splits:
##       LoyalCH     < 0.48285   to the right, improve=138.47760, (0 missing)
##       StoreID     < 3.5       to the right, improve= 26.90500, (0 missing)
##       PriceDiff   < 0.015     to the right, improve= 19.64897, (0 missing)
##       SalePriceMM < 1.84      to the right, improve= 19.58088, (0 missing)
##       DiscMM      < 0.47      to the left,  improve= 17.82679, (0 missing)
##   Surrogate splits:
##       StoreID        < 3.5       to the right, agree=0.614, adj=0.028, (0 split)
##       PriceMM        < 1.89      to the right, agree=0.612, adj=0.022, (0 split)
##       ListPriceDiff  < 0.035     to the right, agree=0.610, adj=0.019, (0 split)
##       WeekofPurchase < 227.5     to the right, agree=0.609, adj=0.016, (0 split)
##       DiscMM         < 0.57      to the left,  agree=0.609, adj=0.016, (0 split)
## 
## Node number 2: 483 observations,    complexity param=0.0248538
##   predicted class=CH  expected loss=0.1884058  P(node) =0.6029963
##     class counts:   392    91
##    probabilities: 0.812 0.188 
##   left son=4 (277 obs) right son=5 (206 obs)
##   Primary splits:
##       LoyalCH     < 0.705699  to the right, improve=16.46708, (0 missing)
##       SalePriceMM < 1.84      to the right, improve=15.70711, (0 missing)
##       PriceDiff   < 0.015     to the right, improve=14.62609, (0 missing)
##       DiscMM      < 0.47      to the left,  improve=12.78514, (0 missing)
##       PctDiscMM   < 0.227263  to the left,  improve=12.78514, (0 missing)
##   Surrogate splits:
##       WeekofPurchase < 237.5     to the right, agree=0.631, adj=0.136, (0 split)
##       PriceCH        < 1.755     to the right, agree=0.625, adj=0.121, (0 split)
##       SalePriceCH    < 1.755     to the right, agree=0.625, adj=0.121, (0 split)
##       PriceMM        < 2.04      to the right, agree=0.621, adj=0.112, (0 split)
##       SpecialMM      < 0.5       to the left,  agree=0.598, adj=0.058, (0 split)
## 
## Node number 3: 318 observations
##   predicted class=MM  expected loss=0.2106918  P(node) =0.3970037
##     class counts:    67   251
##    probabilities: 0.211 0.789 
## 
## Node number 4: 277 observations
##   predicted class=CH  expected loss=0.07581227  P(node) =0.3458177
##     class counts:   256    21
##    probabilities: 0.924 0.076 
## 
## Node number 5: 206 observations,    complexity param=0.0248538
##   predicted class=CH  expected loss=0.3398058  P(node) =0.2571785
##     class counts:   136    70
##    probabilities: 0.660 0.340 
##   left son=10 (131 obs) right son=11 (75 obs)
##   Primary splits:
##       ListPriceDiff < 0.235     to the right, improve=17.647740, (0 missing)
##       SalePriceMM   < 1.84      to the right, improve=16.049010, (0 missing)
##       PriceDiff     < 0.145     to the right, improve=15.304000, (0 missing)
##       DiscMM        < 0.15      to the left,  improve= 8.961921, (0 missing)
##       PctDiscMM     < 0.0729725 to the left,  improve= 8.961921, (0 missing)
##   Surrogate splits:
##       PriceDiff      < 0.235     to the right, agree=0.859, adj=0.613, (0 split)
##       SalePriceMM    < 1.94      to the right, agree=0.777, adj=0.387, (0 split)
##       PriceMM        < 1.89      to the right, agree=0.748, adj=0.307, (0 split)
##       WeekofPurchase < 273.5     to the left,  agree=0.733, adj=0.267, (0 split)
##       DiscMM         < 0.47      to the left,  agree=0.709, adj=0.200, (0 split)
## 
## Node number 10: 131 observations
##   predicted class=CH  expected loss=0.1832061  P(node) =0.1635456
##     class counts:   107    24
##    probabilities: 0.817 0.183 
## 
## Node number 11: 75 observations,    complexity param=0.01461988
##   predicted class=MM  expected loss=0.3866667  P(node) =0.09363296
##     class counts:    29    46
##    probabilities: 0.387 0.613 
##   left son=22 (23 obs) right son=23 (52 obs)
##   Primary splits:
##       PriceDiff   < 0.015     to the right, improve=3.270658, (0 missing)
##       SalePriceMM < 1.84      to the right, improve=3.150053, (0 missing)
##       PctDiscMM   < 0.1961965 to the left,  improve=3.055152, (0 missing)
##       DiscMM      < 0.15      to the left,  improve=2.350005, (0 missing)
##       LoyalCH     < 0.5976    to the right, improve=2.084961, (0 missing)
##   Surrogate splits:
##       SalePriceMM    < 1.84      to the right, agree=0.973, adj=0.913, (0 split)
##       ListPriceDiff  < 0.18      to the right, agree=0.787, adj=0.304, (0 split)
##       WeekofPurchase < 275.5     to the right, agree=0.733, adj=0.130, (0 split)
##       PctDiscMM      < 0.0980985 to the left,  agree=0.733, adj=0.130, (0 split)
##       DiscMM         < 0.15      to the left,  agree=0.707, adj=0.043, (0 split)
## 
## Node number 22: 23 observations
##   predicted class=CH  expected loss=0.3913043  P(node) =0.02871411
##     class counts:    14     9
##    probabilities: 0.609 0.391 
## 
## Node number 23: 52 observations
##   predicted class=MM  expected loss=0.2884615  P(node) =0.06491885
##     class counts:    15    37
##    probabilities: 0.288 0.712

10번 노드를 선택해서 정보를 해석해 보려 한다.

Node number : 10
131 observations
predicted class=CH expected loss=0.1832061 P(node) =0.1635456
class counts: 107 24
probabilities: 0.817 0.183

해당 노드에는 총 131개의 관측치가 존재한다.
해당 노드에서 예측된 클래스는 ‘CH’ 이다.
이 노드의 예측 손실은 18% 이다.
P(node) = 0.1635456 으로 이 노드가 전체 데이터에서 차지하는 비율을 의미한다.
‘CH’ 클래스의 확률은 81.7%, ‘MM’ 클래스의 확률은 18.3%가 된다.

tree_model_selected : name of the tree object
# Detailed text output of the reduced-predictor rpart fit (part (c)).
summary(tree_model_selected)
## Call:
## rpart(formula = Purchase ~ PriceCH + PriceMM + PriceDiff + LoyalCH, 
##     data = train_set, method = "class")
##   n= 801 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.53801170      0 1.0000000 1.0000000 0.04093333
## 2 0.01461988      1 0.4619883 0.4766082 0.03331668
## 3 0.01000000      5 0.3976608 0.4619883 0.03293000
## 
## Variable importance
##   LoyalCH PriceDiff   PriceMM   PriceCH 
##        80        11         6         3 
## 
## Node number 1: 801 observations,    complexity param=0.5380117
##   predicted class=CH  expected loss=0.4269663  P(node) =1
##     class counts:   459   342
##    probabilities: 0.573 0.427 
##   left son=2 (483 obs) right son=3 (318 obs)
##   Primary splits:
##       LoyalCH   < 0.48285   to the right, improve=138.477600, (0 missing)
##       PriceDiff < 0.015     to the right, improve= 19.648970, (0 missing)
##       PriceMM   < 1.89      to the right, improve=  6.509978, (0 missing)
##       PriceCH   < 1.925     to the left,  improve=  1.893242, (0 missing)
##   Surrogate splits:
##       PriceMM   < 1.89      to the right, agree=0.612, adj=0.022, (0 split)
##       PriceDiff < -0.575    to the right, agree=0.608, adj=0.013, (0 split)
## 
## Node number 2: 483 observations,    complexity param=0.01461988
##   predicted class=CH  expected loss=0.1884058  P(node) =0.6029963
##     class counts:   392    91
##    probabilities: 0.812 0.188 
##   left son=4 (277 obs) right son=5 (206 obs)
##   Primary splits:
##       LoyalCH   < 0.705699  to the right, improve=16.467080, (0 missing)
##       PriceDiff < 0.015     to the right, improve=14.626090, (0 missing)
##       PriceMM   < 1.74      to the right, improve= 5.876976, (0 missing)
##       PriceCH   < 1.72      to the right, improve= 1.927536, (0 missing)
##   Surrogate splits:
##       PriceCH   < 1.755     to the right, agree=0.625, adj=0.121, (0 split)
##       PriceMM   < 2.04      to the right, agree=0.621, adj=0.112, (0 split)
##       PriceDiff < 0.015     to the right, agree=0.588, adj=0.034, (0 split)
## 
## Node number 3: 318 observations
##   predicted class=MM  expected loss=0.2106918  P(node) =0.3970037
##     class counts:    67   251
##    probabilities: 0.211 0.789 
## 
## Node number 4: 277 observations
##   predicted class=CH  expected loss=0.07581227  P(node) =0.3458177
##     class counts:   256    21
##    probabilities: 0.924 0.076 
## 
## Node number 5: 206 observations,    complexity param=0.01461988
##   predicted class=CH  expected loss=0.3398058  P(node) =0.2571785
##     class counts:   136    70
##    probabilities: 0.660 0.340 
##   left son=10 (114 obs) right son=11 (92 obs)
##   Primary splits:
##       PriceDiff < 0.145     to the right, improve=15.3040000, (0 missing)
##       PriceMM   < 2.155     to the right, improve= 4.0255650, (0 missing)
##       LoyalCH   < 0.6840375 to the left,  improve= 2.8009220, (0 missing)
##       PriceCH   < 1.72      to the right, improve= 0.7231615, (0 missing)
##   Surrogate splits:
##       PriceMM < 2.04      to the right, agree=0.723, adj=0.380, (0 split)
##       PriceCH < 1.755     to the right, agree=0.626, adj=0.163, (0 split)
##       LoyalCH < 0.6955035 to the left,  agree=0.568, adj=0.033, (0 split)
## 
## Node number 10: 114 observations
##   predicted class=CH  expected loss=0.1666667  P(node) =0.1423221
##     class counts:    95    19
##    probabilities: 0.833 0.167 
## 
## Node number 11: 92 observations,    complexity param=0.01461988
##   predicted class=MM  expected loss=0.4456522  P(node) =0.1148564
##     class counts:    41    51
##    probabilities: 0.446 0.554 
##   left son=22 (59 obs) right son=23 (33 obs)
##   Primary splits:
##       PriceDiff < -0.165    to the right, improve=4.2505640, (0 missing)
##       PriceCH   < 1.755     to the left,  improve=1.4818630, (0 missing)
##       LoyalCH   < 0.5039495 to the right, improve=1.3921350, (0 missing)
##       PriceMM   < 2.11      to the left,  improve=0.5869565, (0 missing)
##   Surrogate splits:
##       PriceCH < 1.91      to the left,  agree=0.783, adj=0.394, (0 split)
##       PriceMM < 2.11      to the left,  agree=0.717, adj=0.212, (0 split)
##       LoyalCH < 0.6111595 to the left,  agree=0.652, adj=0.030, (0 split)
## 
## Node number 22: 59 observations,    complexity param=0.01461988
##   predicted class=CH  expected loss=0.440678  P(node) =0.07365793
##     class counts:    33    26
##    probabilities: 0.559 0.441 
##   left son=44 (42 obs) right son=45 (17 obs)
##   Primary splits:
##       LoyalCH   < 0.5039495 to the right, improve=2.034326, (0 missing)
##       PriceMM   < 1.74      to the right, improve=1.538646, (0 missing)
##       PriceDiff < -0.09     to the left,  improve=1.374719, (0 missing)
##       PriceCH   < 1.72      to the right, improve=1.017856, (0 missing)
## 
## Node number 23: 33 observations
##   predicted class=MM  expected loss=0.2424242  P(node) =0.0411985
##     class counts:     8    25
##    probabilities: 0.242 0.758 
## 
## Node number 44: 42 observations
##   predicted class=CH  expected loss=0.3571429  P(node) =0.05243446
##     class counts:    27    15
##    probabilities: 0.643 0.357 
## 
## Node number 45: 17 observations
##   predicted class=MM  expected loss=0.3529412  P(node) =0.02122347
##     class counts:     6    11
##    probabilities: 0.353 0.647

Node number 4
277 observations
predicted class=CH expected loss=0.07581227 P(node) =0.3458177
class counts: 256 21
probabilities: 0.924 0.076

해당 노드는 전체 277개의 관측치를 포함하며, 이 중 256개가 ‘CH’ 클래스, 21개가 ‘MM’ 클래스이다.
이 노드는 ‘CH’ 클래스로 예측되고 있다.
이 노드의 예측 손실은 0.0758이고 약 7.58%의 관측치가 오분류되었음을 의미한다.
P(node) = 0.3458177 : 전체 데이터의 약 34.58%가 이 노드에 속하고 있다.

(d) Create a plot of the tree, and interpret the results.

# Print and plot the all-predictor tree (part (d)).
tree_model_all
rpart.plot(tree_model_all)

해당 트리에 사용된 predictors : [LoyalCH, ListPriceDiff, PriceDiff]
root node : LoyalCH(기준 : 0.48) LoyalCH가 0.48보다 같거나 크다면, 오른쪽으로 그렇지 않다면 왼쪽으로 데이터를 분할한다. 총 5개의 leaf nodes가 존재하고, CH로 분류하는 단말노드는 3개, MM으로 분류하는 단말노드는 2개이다.

# Print and plot the reduced-predictor tree (part (d)).
tree_model_selected
rpart.plot(tree_model_selected)

해당 트리에 사용된 predictors : [LoyalCH, PriceDiff]
root node : LoyalCH(기준 : 0.48) 이전의 모델과 동일하게 LoyalCH가 0.48보다 같거나 크다면, 오른쪽으로 그렇지 않다면 왼쪽으로 데이터를 분할한다. 총 6개의 leaf nodes가 존재하고, CH로 분류하는 단말노드는 3개, MM으로 분류하는 단말노드 역시 동일하게 3개이다.

(e) Predict the response on the test data, and produce a confusion matrix comparing the test labels to the predicted test labels. What is the test error rate?

confusion matrix(test set) : tree_model_all
# Confusion matrix for the all-predictor tree on the test set (part (e)).
# Rows = actual class, columns = predicted class.
test_conf_matrix_all
##     test_pred_all
##      CH MM
##   CH 17  4
##   MM  4 12
# Error rate = 1 - accuracy, where accuracy = diagonal (correct) / total.
test_error_rate_all = 1 - sum(diag(test_conf_matrix_all)) / sum(test_conf_matrix_all)
cat("test error rate : ", test_error_rate_all)
## test error rate :  0.2162162
confusion matrix(test set) : tree_model_selected
# Confusion matrix and test error rate for the selected-predictor tree.
test_conf_matrix_selected
##     test_pred_all
##      CH MM
##   CH 17  4
##   MM  4 12
# Error rate = 1 - accuracy (diagonal / total).
test_error_rate_selected = 1 - sum(diag(test_conf_matrix_selected)) / sum(test_conf_matrix_selected)
cat("test error rate : ", test_error_rate_selected)
## test error rate :  0.2162162

모든 변수를 사용해서 만든 모델과 변수를 선별해서 생성한 모델 간 유의미한 차이가 없기 때문에, 이후로는 모든 변수를 이용해서 생성한 모델에 대해서만 기술한다.

(f) Apply the cv.tree() function to the training set in order to determine the optimal tree size.

# Fit an unpruned classification tree with the tree package, then use
# cross-validation to determine the optimal tree size (part (f)).
tree_model_cv = tree(Purchase ~ ., data = train_set)
# FUN = prune.misclass: prune and score by misclassification count
# rather than deviance.
cv_tree = cv.tree(tree_model_cv, FUN = prune.misclass)
print(cv_tree)
## $size
## [1] 8 6 2 1
## 
## $dev
## [1] 170 170 168 342
## 
## $k
## [1]   -Inf   0.00   7.25 184.00
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
# $dev holds the CV misclassification counts per candidate size;
# which.min picks the position of the smallest count (size 2 here).
optimal_size = cv_tree$size[which.min(cv_tree$dev)]
cat("optimal_size: ", optimal_size)
## optimal_size:  2

(g) Produce a plot with tree size on the x-axis and cross-validated classification error rate on the y-axis.

# Convert CV misclassification counts into error rates by dividing by the
# number of training observations, then plot error vs. tree size (part (g)).
cv_data = data.frame(size = cv_tree$size, error = cv_tree$dev /length(train_set$Purchase))
ggplot(cv_data, aes(x = size, y = error)) +
  geom_point() +
  geom_line() +
  labs(x = "Tree Size", y = "Cross-Validated Classification Error Rate") +
  theme_minimal()

rpart 패키지를 사용한 의사결정 모델
# Fit the rpart classification tree and inspect its CP (complexity parameter)
# table: rel error is training error, xerror is cross-validated error,
# both relative to the root node error.
rpart_tree_model = rpart(Purchase ~ ., data = train_set, method = "class")
printcp(rpart_tree_model)
## 
## Classification tree:
## rpart(formula = Purchase ~ ., data = train_set, method = "class")
## 
## Variables actually used in tree construction:
## [1] ListPriceDiff LoyalCH       PriceDiff    
## 
## Root node error: 342/801 = 0.42697
## 
## n= 801 
## 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.538012      0   1.00000 1.00000 0.040933
## 2 0.024854      1   0.46199 0.49415 0.033764
## 3 0.014620      3   0.41228 0.47661 0.033317
## 4 0.010000      4   0.39766 0.46199 0.032930
# Plot xerror against CP/tree size to visualize the pruning trade-off.
plotcp(rpart_tree_model)

(h) Which tree size corresponds to the lowest cross-validated classification error rate?

# Cross-validated error by tree size, computed in part (g) (part (h)).
cv_data
##   size     error
## 1    8 0.2122347
## 2    6 0.2122347
## 3    2 0.2097378
## 4    1 0.4269663

cv_data 표를 보면 tree size가 2일 때 error rate이 0.2097로 가장 낮았다. tree size가 1에서 2로 변화할 때 error rate이 크게 감소하고 이후에는 눈에 띄는 변화가 관찰되지 않으므로, variance를 낮추는 관점에서도 size 2가 적절하다는 결론을 내린다.

(i) Produce a pruned tree corresponding to the optimal tree size obtained using cross-validation. If cross-validation does not lead to selection of a pruned tree, then create a pruned tree with five terminal nodes.

# Choose the CP row with the minimum cross-validated error (xerror);
# for rpart, tree size = nsplit + 1 terminal nodes.
rpart_tree_model
optimal_cp = rpart_tree_model$cptable[which.min(rpart_tree_model$cptable[,"xerror"]),"CP"]
optimal_size = rpart_tree_model$cptable[which.min(rpart_tree_model$cptable[,"xerror"]),"nsplit"] + 1

cat("Optimal CP value:", optimal_cp, "\n")
## Optimal CP value: 0.01
cat("Optimal Tree Size:", optimal_size, "\n")
## Optimal Tree Size: 5

\(R_\alpha(T)=R(T) + \alpha|T|\) 위의 수식에서 \(|T|\)는 terminal node의 개수이며, \(\alpha\)가 0.01일 때 optimal이 된다.

before pruning
# Original (unpruned) tree, for comparison against the pruned tree (part (i)).
rpart.plot(rpart_tree_model, main = "Origin Tree")

after pruning
# Prune at the CP whose tree size (nsplit + 1) is closest to 2.
# NOTE(review): this deliberately targets a 2-leaf tree rather than the
# min-xerror CP computed above (see commented alternative) — confirm intent.
pruned_rpart_tree_model <- prune(rpart_tree_model, cp = rpart_tree_model$cptable[which.min(abs(rpart_tree_model$cptable[, "nsplit"] + 1 -2)), "CP"])
#pruned_rpart_tree_model = prune(rpart_tree_model, cp = optimal_cp)
rpart.plot(pruned_rpart_tree_model)

(j) Compare the training error rates between the pruned and unpruned trees. Which is higher?

# Part (j): compare training error of the unpruned vs pruned trees.
train_pred_unpruned = predict(rpart_tree_model, train_set, type = "class")
train_pred_pruned = predict(pruned_rpart_tree_model, train_set, type = "class")

# Rows = actual class, columns = predicted class.
train_conf_matrix_unpruned = table(train_set$Purchase, train_pred_unpruned)
train_conf_matrix_pruned = table(train_set$Purchase, train_pred_pruned)

# Compute training error rates (1 - accuracy)
train_error_rate_unpruned = 1 - sum(diag(train_conf_matrix_unpruned)) / sum(train_conf_matrix_unpruned)
train_error_rate_pruned = 1 - sum(diag(train_conf_matrix_pruned)) / sum(train_conf_matrix_pruned)
cat("training error rates(unpruned) : ", train_error_rate_unpruned, " \n")
## training error rates(unpruned) :  0.1697878
cat("training error rates(pruned) : " , train_error_rate_pruned, " \n")
## training error rates(pruned) :  0.1972534

train set에 대해서는 pruned_model이 unpruned_model 보다 error rate이 높게 나타났다.

(k) Compare the test error rates between the pruned and unpruned trees. Which is higher?

# Part (k): compare test error of the unpruned vs pruned trees.
test_pred_unpruned = predict(rpart_tree_model, test_set, type = "class")
test_pred_pruned = predict(pruned_rpart_tree_model, test_set, type = "class")

# Rows = actual class, columns = predicted class.
test_conf_matrix_unpruned = table(test_set$Purchase, test_pred_unpruned)
test_conf_matrix_pruned = table(test_set$Purchase, test_pred_pruned)

# Compute test error rates (1 - accuracy)
test_error_rate_unpruned = 1 - sum(diag(test_conf_matrix_unpruned)) / sum(test_conf_matrix_unpruned)
test_error_rate_pruned = 1 - sum(diag(test_conf_matrix_pruned)) / sum(test_conf_matrix_pruned)
test_conf_matrix_unpruned
##     test_pred_unpruned
##      CH MM
##   CH 17  4
##   MM  4 12
test_conf_matrix_pruned
##     test_pred_pruned
##      CH MM
##   CH 17  4
##   MM  5 11
cat("test error rates(unpruned) : ", test_error_rate_unpruned, " \n")
## test error rates(unpruned) :  0.2162162
cat("test error rates(pruned) : " , test_error_rate_pruned, " \n")
## test error rates(pruned) :  0.2432432

test set에 대해서도 역시 pruned model이 높은 error rate을 보였다. test data가 충분하지 않아서 발생한 문제라고 추측한다. confusion matrix를 통해 정성적으로 파악해보면, 유의미한 차이를 찾을 수 없었다.

This question uses the Caravan data set.

(a) Create a training set consisting of the first 1,000 observations, and a test set consisting of the remaining observations.

# Split the Caravan data into a train set and a test set.
data("Caravan")

# Drop rows with missing values before splitting.
Caravan = na.omit(Caravan)

dim(Caravan)
## [1] 5822   86
set.seed(123)

# NOTE(review): the exercise asks for the FIRST 1,000 observations as the
# training set; this code draws a random sample of 1,000 instead — confirm intent.
indices = sample(1:nrow(Caravan))

train_indices = indices[1:1000]
test_indices = indices[1001:length(indices)]

trainSet = Caravan[train_indices, ]
testSet = Caravan[test_indices, ]

# Recode Purchase from "Yes"/"No" to a 1/0 factor for modeling.
trainSet$Purchase = as.factor(ifelse(trainSet$Purchase == "Yes", 1, 0))
testSet$Purchase = as.factor(ifelse(testSet$Purchase == "Yes", 1, 0))

# Identify near-zero-variance predictors on the training set (response
# excluded), then drop them from BOTH splits so columns stay consistent.
nzv = nearZeroVar(trainSet[, -which(names(trainSet) == "Purchase")], saveMetrics = TRUE)
nzv_vars = rownames(nzv[nzv$nzv, ])

trainSet = trainSet[, !(names(trainSet) %in% nzv_vars)]
testSet = testSet[, !(names(testSet) %in% nzv_vars)]

(b) Fit a boosting model to the training set with Purchase as the response and the other variables as predictors. Use 1,000 trees, and a shrinkage value of 0.01. Which predictors appear to be the most important?

# 5-fold cross-validation wrapper for the gbm fit via caret (part (b)).
train_control = trainControl(method = "cv", number = 5)

# Boosting with the parameters required by the exercise — 1,000 trees,
# shrinkage 0.01 — fixed via a single-row tuneGrid (depth-1 stumps,
# minimum 10 observations per terminal node).
boost.model = train(Purchase ~ ., 
                     data = trainSet, 
                     method = "gbm", 
                     trControl = train_control, 
                     verbose = FALSE, 
                     tuneGrid = expand.grid(n.trees = 1000, 
                                            interaction.depth = 1, 
                                            shrinkage = 0.01, 
                                            n.minobsinnode = 10))
print(boost.model)
## Stochastic Gradient Boosting 
## 
## 1000 samples
##   51 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 799, 800, 800, 801, 800 
## Resampling results:
## 
##   Accuracy   Kappa       
##   0.9330043  -0.001874549
## 
## Tuning parameter 'n.trees' was held constant at a value of 1000
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.01
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# Variable importance from the fitted gbm; scale = FALSE keeps the raw
# relative-influence values instead of rescaling to 0-100.
importance = varImp(boost.model, scale = FALSE)
print(importance)
## gbm variable importance
## 
##   only 20 most important variables shown (out of 51)
## 
##          Overall
## PPERSAUT  68.350
## MOSTYPE   42.013
## MINK4575  30.811
## APERSAUT  21.938
## MGODPR    20.061
## MBERMIDD  18.859
## PBRAND    18.750
## MOSHOOFD  15.795
## MOPLLAAG  11.391
## MINK3045  10.814
## MINK7512  10.052
## MINKGEM    9.807
## MZFONDS    8.662
## MGODOV     8.558
## MSKB2      8.478
## MGODGE     8.384
## MINKM30    8.031
## MSKB1      7.899
## MFWEKIND   7.430
## MBERZELF   6.748

Caravan data set에서 Purchase 변수를 예측하는 데에 있어서 PPERSAUT 변수가 가장 중요한 것으로 나타난다.